Door and Key Experiments in Curriculum Learning


Salkey, Jayson

26/07/2018


Setup

Import Useful Libraries

In [19]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import time
from collections import namedtuple
from scipy import stats

Set options

In [20]:
# Notebook magic: render matplotlib figures inline in the output cells.
%matplotlib inline
# Print numpy arrays with three decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=1)

A hallway world

In [21]:
class Hallway(object):
  """A rooms grid world with a key, doors and reward goals.

  Cell encoding in self._layout:
    10: key -- picking it up ends the episode segment (discount 0)
    -2: door -- passable only when the agent inventory contains 'KEY'
    -1: wall
     0: empty cell, episode continues
    >0: reward value; reaching it terminates the episode (discount 0)
  """

  def __init__(self, goal_loc, tabular=True, vision_size=1, discount=0.98, noisy=False):
    """Build the world and write the goals from `goal_loc` into the layout.

    Args:
      goal_loc: iterable of (row, col, reward) triples to place in the grid.
      tabular: if True, observations are flat state indices; otherwise a
        (2*vision_size+1)^2 local view of the layout.
      vision_size: half-width of the local observation window.
      discount: per-step discount returned while the episode continues.
      noisy: if True, Gaussian noise is added to every reward.
    """
    self._wall = -1
    self._door = -2
    self._key = 10

    # "ROOMS" layout: a large left room and a small right room sealed off by
    # doors (-2), both connected to a short corridor at the bottom.
    self._layout = np.array([
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -2, -2, -2, -2, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -2,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -2,  0,  0,  0, -1, -1],
        [-1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -2,  0,  0,  0, -1, -1],
        [-1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
      ])

    self._goals = set()    # distinct goal/reward values placed in the grid
    self._goal_loc = []    # (row, col) of every goal, in insertion order
    for row, col, value in goal_loc:
      self._layout[row, col] = value
      self._goal_loc.append((row, col))
      self._goals.add(value)

    # Agent state is a (row, col) tuple.
    self._start_state = (11, 8)
    self._state = self._start_state
    self._number_of_states = np.prod(np.shape(self._layout))
    self._noisy = noisy
    self._tabular = tabular
    self._vision_size = vision_size
    self._discount = discount
    self._distanceToGoal = None

    # Shortest walkable path from the start to the *last* goal; used to
    # normalise the shaped reward in step().
    self._minDistanceStartGoal = self.minDistanceTwoPoints(
        self._start_state[0], self._start_state[1],
        self._goal_loc[-1][0], self._goal_loc[-1][1])

  def resetState(self):
    """Move the agent back to the start cell."""
    self._state = self._start_state

  def distanceToGoal(self):
    """Loose upper bound on any path length: the total number of cells."""
    return np.prod(self._layout.shape)

  def _neighbours(self, row, col):
    """Yield the walkable (in-bounds, non-wall) 4-neighbours of (row, col)."""
    rows, cols = self._layout.shape
    for nr, nc in ((row + 1, col), (row - 1, col), (row, col + 1), (row, col - 1)):
      if 0 <= nr < rows and 0 <= nc < cols and self._layout[nr, nc] != self._wall:
        yield nr, nc

  def minDistanceTwoPoints(self, cy, cx, dy, dx):
    """Shortest walkable distance from (cy, cx) to (dy, dx).

    BUGFIX: the original explored with a depth-first stack, which returns
    the length of an arbitrary path, not the shortest one.  Breadth-first
    search guarantees the first time the target is dequeued its distance
    is minimal.

    Raises:
      ValueError: if (dy, dx) is unreachable from (cy, cx).
    """
    from collections import deque
    queue = deque([(cy, cx, 0)])
    visited = set()
    while queue:
      row, col, dist = queue.popleft()
      if (row, col) in visited:
        continue
      visited.add((row, col))
      if (row, col) == (dy, dx):
        return dist
      for nr, nc in self._neighbours(row, col):
        queue.append((nr, nc, dist + 1))
    raise ValueError("({}, {}) is unreachable from ({}, {})".format(dy, dx, cy, cx))

  def distanceToNearestGoal(self, new_y, new_x):
    """Return (distance, location) of the goal nearest to (new_y, new_x).

    Breadth-first search guarantees the first goal dequeued is the closest
    one (ties broken by expansion order).

    Raises:
      ValueError: if no goal is reachable.
    """
    from collections import deque
    goal_cells = set(self._goal_loc)
    queue = deque([(new_y, new_x, 0)])
    visited = set()
    while queue:
      row, col, dist = queue.popleft()
      if (row, col) in visited:
        continue
      visited.add((row, col))
      if (row, col) in goal_cells:
        return dist, (row, col)
      for nr, nc in self._neighbours(row, col):
        queue.append((nr, nc, dist + 1))
    raise ValueError("no goal reachable from ({}, {})".format(new_y, new_x))

  def handleDoor(self):
    # Door logic lives in step(); kept for interface compatibility.
    pass

  @property
  def number_of_states(self):
    """Total number of grid cells (the tabular state-space size)."""
    return self._number_of_states

  def plot_grid(self, title=None):
    """Draw the layout, marking start (S), keys (K), goals (G) and doors (D)."""
    plt.figure(figsize=(4, 4))
    plt.imshow(self._layout != self._wall, interpolation="nearest", cmap='pink')
    ax = plt.gca()
    ax.grid(0)
    plt.xticks([])
    plt.yticks([])
    plt.title(title if title is not None else "The Grid")
    plt.text(self._start_state[1], self._start_state[0], r"$\mathbf{S}$", ha='center', va='center')

    # Mark every cell holding a key or a positive goal value.
    for value in self._goals:
      if value == self._key:
        label = r"$\mathbf{K}$"
      elif value > 0:
        label = r"$\mathbf{G}$"
      else:
        continue
      ys, xs = np.where(self._layout == value)
      for yy, xx in zip(ys, xs):
        plt.text(xx, yy, label, ha='center', va='center')
    ys, xs = np.where(self._layout == self._door)
    for yy, xx in zip(ys, xs):
      plt.text(xx, yy, r"$\mathbf{D}$", ha='center', va='center')

    # Thin grid lines between cells.
    h, w = self._layout.shape
    for y in range(h - 1):
      plt.plot([-0.5, w - 0.5], [y + 0.5, y + 0.5], '-k', lw=2)
    for x in range(w - 1):
      plt.plot([x + 0.5, x + 0.5], [-0.5, h - 0.5], '-k', lw=2)

  def get_obs(self):
    """Observation for the agent's current position."""
    y, x = self._state
    return self.get_obs_at(x, y)

  def get_obs_at(self, x, y):
    """Observation for cell (x, y): a flat index if tabular, else a local view.

    NOTE(review): the non-tabular slice is not clipped at the grid edge, so
    views near the border come back smaller than (2v+1)x(2v+1) -- confirm
    callers only request interior cells.
    """
    if self._tabular:
      return y * self._layout.shape[1] + x
    else:
      v = self._vision_size
      return self._layout[y - v:y + v + 1, x - v:x + v + 1]

  def step(self, action, agent_inventory):
    """Apply `action` (0=up, 1=right, 2=down, 3=left) and advance the world.

    Args:
      action: integer in {0, 1, 2, 3}.
      agent_inventory: set of items held by the agent ('KEY' opens doors).

    Returns:
      (reward, discount, observation, item) -- item is 'KEY' when a key was
      just picked up, otherwise None.

    Raises:
      ValueError: on an invalid action.
    """
    item = None
    y, x = self._state

    if action == 0:    # up
      new_state = (y - 1, x)
    elif action == 1:  # right
      new_state = (y, x + 1)
    elif action == 2:  # down
      new_state = (y + 1, x)
    elif action == 3:  # left
      new_state = (y, x - 1)
    else:
      raise ValueError("Invalid action: {} is not 0, 1, 2, or 3.".format(action))

    new_y, new_x = new_state
    discount = self._discount
    cell = self._layout[new_y, new_x]
    if cell == self._wall:
      # Bumping a wall: small penalty, stay in place.
      reward = -1
      new_state = (y, x)
    elif cell == self._key:
      # Key pickup ends the segment and teleports back to the start.
      reward = cell
      item = 'KEY'
      discount = 0.
      new_state = self._start_state
      print('PICKED UP KEY')
    elif cell == self._door:
      # A door rewards passage with a key; without one it blocks and penalises.
      reward = 5
      if 'KEY' not in agent_inventory:
        reward = cell
        new_state = (y, x)
    elif cell > 0:
      # Goal reached: terminal reward.
      reward = cell
      discount = 0.
      new_state = self._start_state
    else:
      # Empty cell: shaped reward decaying exponentially with the distance to
      # the nearest goal, normalised by the relevant start-to-goal distance.
      distToNearestGoal, nearestGoal = self.distanceToNearestGoal(new_y, new_x)
      if len(self._goal_loc) != 1:
        minDistance = self.minDistanceTwoPoints(self._start_state[0], self._start_state[1],
                                                nearestGoal[0], nearestGoal[1])
      else:
        minDistance = self._minDistanceStartGoal

      distToNearestGoal = float(distToNearestGoal)
      minDistance = float(minDistance)

      reward = self._layout[nearestGoal[0], nearestGoal[1]] * np.exp(-distToNearestGoal / minDistance)
      # The 100-valued goal is gated behind the key: no shaping until held.
      if self._layout[nearestGoal[0], nearestGoal[1]] == 100 and 'KEY' not in agent_inventory:
        reward = 0.

    if self._noisy:
      # Noise scale grows towards the bottom-left of the grid.
      width = self._layout.shape[1]
      reward += 2 * np.random.normal(0, width - new_x + new_y)

    self._state = new_state

    return reward, discount, self.get_obs(), item

The Hallway(s)

In [22]:
# Build the curriculum: one multi-goal world followed by four single-goal worlds.
tasks = [
    Hallway(goal_loc=[(8, 2, 10), (2, 2, 5), (2, 14, 5), (8, 14, 5)], discount=0.98),
    Hallway(goal_loc=[(8, 2, 10)], discount=0.98),
    Hallway(goal_loc=[(2, 14, 5)], discount=0.98),
    Hallway(goal_loc=[(2, 13, 5)], discount=0.98),
    Hallway(goal_loc=[(8, 14, 5)], discount=0.98),
]
for idt, task in enumerate(tasks):
  task.plot_grid(title="grid_{}".format(idt))

Implement agents

In [23]:
class GeneralQ(object):
  """Tabular expected-SARSA / (double) Q-learning agent with experience replay.

  The target policy (greedy for Q-learning, one-hot on the taken action for
  SARSA) defines the bootstrap expectation; the behaviour policy picks
  actions.  With double=True two tables are kept and updated symmetrically,
  each bootstrapping from the other.
  """

  def __init__(self, number_of_states, number_of_actions, initial_state, target_policy, behaviour_policy, double, num_offline_updates=30, step_size=0.1):
    """Args:
      number_of_states, number_of_actions: value-table dimensions.
      initial_state: state index the agent starts in.
      target_policy: fn(q_row, action) -> distribution used for bootstrapping.
      behaviour_policy: fn(q_row, train) -> action to take.
      double: keep two value tables (double learning) when True.
      num_offline_updates: replayed transitions applied per real step.
      step_size: learning rate.
    """
    self._q = np.zeros((number_of_states, number_of_actions))
    self._replayBuffer_A = []
    if double:
      self._q2 = np.zeros((number_of_states, number_of_actions))
      self._replayBuffer_B = []
    self._s = initial_state
    self._initial_state = initial_state
    self._number_of_actions = number_of_actions
    self._step_size = step_size
    self._behaviour_policy = behaviour_policy
    self._target_policy = target_policy
    self._double = double
    self._num_offline_updates = num_offline_updates
    self._last_action = 0
    self._inventory = set()

  def resetReplayBuffer(self):
    """Empty the replay buffer(s)."""
    self._replayBuffer_A = []
    if self._double:
      self._replayBuffer_B = []

  @property
  def q_values(self):
    """Current value estimates (average of both tables when double)."""
    if self._double:
      return (self._q + self._q2) / 2
    else:
      return self._q

  def resetState(self):
    """Return the agent to its initial state."""
    self._s = self._initial_state

  def _replay_sweep(self, buffer, q_update, q_eval):
    """Apply num_offline_updates TD updates on uniformly sampled transitions.

    q_update supplies the target-policy distribution and receives the update;
    q_eval supplies the bootstrap values (q_eval is q_update itself in the
    single-table case).
    """
    for _ in range(self._num_offline_updates):
      s0, a0, r, g, s1, a1 = buffer[np.random.randint(len(buffer))]
      expectation = np.sum(self._target_policy(q_update[s1, :], a1) * q_eval[s1, :])
      q_update[s0, a0] += self._step_size * (r + g * expectation - q_update[s0, a0])

  def step(self, r, g, s, item, train):
    """Consume (reward r, discount g, next state s, picked-up item), choose
    the next action and, when train is True, apply the online TD update plus
    replayed offline updates.

    Returns:
      (action, inventory, td) -- td is the step-size-scaled online TD term.
    """
    if item != None:
      self._inventory.add(item)

    if self._double:
      # The behaviour policy acts on the averaged tables.
      next_action = self._behaviour_policy(self.q_values[s, :], train)
      if np.random.random() <= 0.5:
        # Update q, bootstrapping from q2 (double learning).
        expectation = np.sum(self._target_policy(self._q[s, :], next_action) * self._q2[s, :])
        td = self._step_size * (r + g * expectation - self._q[self._s, self._last_action])
        if train == True:
          self._q[self._s, self._last_action] += td
          self._replayBuffer_A.append([self._s, self._last_action, r, g, s, next_action])
          self._replay_sweep(self._replayBuffer_A, self._q, self._q2)
      else:
        # Symmetric branch: update q2, bootstrapping from q.
        # BUGFIX: the replayed updates here previously modified q with a mixed
        # q/q2 expectation instead of mirroring the online q2 update.
        expectation = np.sum(self._target_policy(self._q2[s, :], next_action) * self._q[s, :])
        td = self._step_size * (r + g * expectation - self._q2[self._s, self._last_action])
        if train == True:
          self._q2[self._s, self._last_action] += td
          self._replayBuffer_B.append([self._s, self._last_action, r, g, s, next_action])
          self._replay_sweep(self._replayBuffer_B, self._q2, self._q)
    else:
      next_action = self._behaviour_policy(self._q[s, :], train)
      # Expected-SARSA style bootstrap under the target policy.
      expectation = np.sum(self._target_policy(self._q[s, :], next_action) * self._q[s, :])
      td = self._step_size * (r + g * expectation - self._q[self._s, self._last_action])
      if train == True:
        self._q[self._s, self._last_action] += td
        self._replayBuffer_A.append([self._s, self._last_action, r, g, s, next_action])
        # BUGFIX: replay previously bootstrapped from self._q2, which does not
        # exist when double=False (AttributeError on the first trained step).
        self._replay_sweep(self._replayBuffer_A, self._q, self._q)

    self._s = s
    self._last_action = next_action
    return self._last_action, self._inventory, td
    
In [24]:
def Q_target_policy(q, a):
  """Greedy target policy: one-hot distribution on the argmax action (a ignored)."""
  greedy = int(np.argmax(q))
  dist = np.zeros(len(q))
  dist[greedy] = 1.0
  return dist

def SARSA_target_policy(q, a):
  """On-policy (SARSA) target: one-hot distribution on the taken action a."""
  onehot = np.zeros(len(q))
  onehot[a] = 1.0
  return onehot

def gen_behaviour_policy(q, train):
  """Epsilon-greedy behaviour: explore more (0.2) in training, less (0.05) at evaluation."""
  eps = 0.2 if train == True else 0.05
  return epsilon_greedy(q, eps)

An agent that uses Neural-Sarsa/DQN to learn action values. The agent should expect an n×n input, which it should flatten into a vector and then pass through a multi-layer perceptron with a single hidden layer of 100 hidden nodes and ReLU activations. Each weight layer should also have a bias. Initialize all weights uniformly at random in $[-0.05, 0.05]$.

NeuralSarsa(number_of_features=(2*vision_size + 1)**2,
            number_of_hidden=100,
            number_of_actions=4,
            initial_state=grid.get_obs(),
            step_size=0.01)

DQN(number_of_features=(2*vision_size + 1)**2,
            number_of_hidden=100,
            number_of_actions=4,
            initial_state=grid.get_obs(),
            step_size=0.01)

The number vision_size will be either 1 or 2 below. The input vector will be of size $(2v + 1)^2$, which will correspond to a square local view of the grid, centered on the agent, and of size $(2v + 1) \times (2v + 1)$ (so either 3x3 or 5x5).

In [25]:
class NEURAL_CONTROLLER_DRIVER(object):
  """Hierarchical agent built from two TF1 MLPs sharing one session.

  The "controller" network picks which task to train on (its softmax over
  Q-values is exposed via getProbs); the "driver" network picks primitive
  actions inside a task.  Each network is trained with either DQN or neural
  SARSA on its own replay buffer.
  """

  # Target Network is the same, as C-step is just C=1

  def __init__(self, number_of_features_controller,
                number_of_features_driver,
                number_of_hidden_controller,
                number_of_hidden_driver,
                number_of_actions_controller,
                number_of_actions_driver,
                initial_state_controller,
                initial_state_driver, 
                rl_alg_controller='DQN',
                rl_alg_driver='DQN', 
                num_offline_updates_controller=20, 
                num_offline_updates_driver=25,
                step_size_controller=0.01,
                step_size_driver=0.01): 
    """Set up both networks' hyper-parameters and build the TF graph.

    rl_alg_controller / rl_alg_driver select 'DQN' or 'NEURALSARSA' updates;
    num_offline_updates_* is the number of replayed transitions per step.
    """
    # NOTE(review): wipes the process-wide default graph; any graph built
    # earlier in the notebook is discarded.
    tf.reset_default_graph()

    # --- driver (primitive-action) network state ---
    self._prev_action_driver = 0
    self._step_driver = step_size_driver
    self._num_features_driver = number_of_features_driver
    self._num_action_driver = number_of_actions_driver
    self._num_hidden_driver = number_of_hidden_driver
    self._initial_state_driver = initial_state_driver
    self._s_driver = initial_state_driver
    self._s_driver = np.reshape(self._s_driver, (1,-1))
    self._times_trained_driver = 0
    self._inventory = set()
    self._replayBuffer_driver = []
    self._num_offline_updates_driver = num_offline_updates_driver
    self._rl_alg_driver = rl_alg_driver



    
    # --- controller (task-selection) network state ---
    self._prev_action_controller = 0
    self._step_controller = step_size_controller
    self._num_features_controller = number_of_features_controller
    self._num_action_controller = number_of_actions_controller
    self._num_hidden_controller = number_of_hidden_controller
    self._initial_state_controller = initial_state_controller
    self._s_controller = initial_state_controller
    self._s_controller = np.reshape(self._s_controller, (1,-1))
    self._times_trained_controller = 0
    self._replayBuffer_controller = []
    self._num_offline_updates_controller = num_offline_updates_controller
    self._rl_alg_controller = rl_alg_controller
    self.name = 'HYPER '+self._rl_alg_controller
  
    # NOTE(review): the uniform prior is sized by the controller's feature
    # count; arguably it should be the number of tasks/actions -- confirm.
    self._probs_controller = np.ones((1, self._num_features_controller))/(self._num_features_controller*1.)
    
    
    self._times_used = 0.
    
    self.handleTF()
  
  def reset(self):
    """Rebuild the graph and reset all learned state for both networks."""
    # NOTE(review): the previous tf.Session is never closed before the graph
    # is rebuilt, so repeated resets leak session resources.
    tf.reset_default_graph()
    self.handleTF()
    self.resetState_controller()
    self.resetReplayBuffer_controller()
    self._probs_controller = np.ones((1, self._num_features_controller))/(self._num_features_controller*1.)
    self._times_trained_controller = 0
    self._prev_action_controller = 0
    self.resetReplayBuffer()
    self.resetState()
    self._times_trained_driver = 0
    self._prev_action_driver = 0
    self._inventory = set()
    self._times_used = 0

  def resetReplayBuffer_controller(self):
    """Empty the controller's replay buffer."""
    self._replayBuffer_controller = []
    
  def resetState_controller(self):
    """Reset the controller's last-seen state to its initial observation."""
    self._s_controller = self._initial_state_controller 
    self._s_controller = np.reshape(self._s_controller, (1,-1))
    
  def handleTF(self):
    """Build both MLPs (one hidden ReLU layer each, weights U[-0.05, 0.05]),
    their MSE TD losses, RMSProp optimisers and a fresh session."""
    self._sess = tf.Session()
    #tf.reset_default_graph()
    # Controller graph: placeholders for (r, g, bootstrap value, action, state).
    self.rewTensor_controller = tf.placeholder(tf.float64)
    self.disTensor_controller = tf.placeholder(tf.float64)
    self.nqTensor_controller = tf.placeholder(tf.float64)
    self.actionTensor_controller = tf.placeholder(tf.int32)
    self.stateTensor_controller = tf.placeholder(tf.float64, shape=(1,self._num_features_controller))
    self._dense_1_controller = tf.layers.dense(self.stateTensor_controller,
                                    self._num_hidden_controller, activation=tf.nn.relu,
                                    kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
                                    bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
    self._dense_2_controller = tf.layers.dense(self._dense_1_controller,
                                    self._num_action_controller, activation=None,
                                    kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
                                    bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
    self._q_controller = tf.reshape(self._dense_2_controller, (self._num_action_controller,))    
    self._softmx_controller = tf.nn.softmax(self._q_controller)
    # MSE between the TD target r + g*nq and the chosen action's Q-value.
    self._cost_controller = tf.losses.mean_squared_error(self.rewTensor_controller + self.disTensor_controller*self.nqTensor_controller, self._q_controller[self.actionTensor_controller])
    self._opt_controller = tf.train.RMSPropOptimizer(self._step_controller).minimize(self._cost_controller) 
    

    # Driver graph: identical architecture, separate placeholders/weights.
    self.rewTensor_driver = tf.placeholder(tf.float64)
    self.disTensor_driver = tf.placeholder(tf.float64)
    self.nqTensor_driver = tf.placeholder(tf.float64)
    self.actionTensor_driver = tf.placeholder(tf.int32)
    self.stateTensor_driver = tf.placeholder(tf.float64, shape=(1,self._num_features_driver))
    self._dense_1_driver = tf.layers.dense(self.stateTensor_driver,
                                    self._num_hidden_driver, activation=tf.nn.relu,
                                    kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
                                    bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
    self._dense_2_driver = tf.layers.dense(self._dense_1_driver,
                                    self._num_action_driver, activation=None,
                                    kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
                                    bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
    self._q_driver = tf.reshape(self._dense_2_driver, (self._num_action_driver,))    
    self._cost_driver = tf.losses.mean_squared_error(self.rewTensor_driver+ self.disTensor_driver*self.nqTensor_driver, self._q_driver[self.actionTensor_driver])
    self._opt_driver = tf.train.RMSPropOptimizer(self._step_driver).minimize(self._cost_driver)


    # Initialise the freshly created variables of both networks.
    self._sess.run(tf.global_variables_initializer())

  def _target_policy_controller(self, q, a):
    """SARSA-style target: one-hot on the taken action."""
    return np.eye(len(q))[a]
 
  def _behaviour_policy_controller(self, q):    
    """Epsilon-greedy (eps=0.2) task selection."""
    return epsilon_greedy(q, 0.2)# if train == True else np.random.choice(np.where(np.max(q) == q)[0])

  def getProbs(self):
    """Softmax of the controller's Q-values at the last queried state."""
    return self._probs_controller

  def q_controller(self, obs):
    """Run the controller network: returns (q_values, softmax_probs) for obs."""
    #print [n.name for n in tf.get_default_graph().as_graph_def().node]
    obs = np.reshape(obs,(1,-1))
    #print obs
    t, probs = self._sess.run([self._q_controller, self._softmx_controller], {self.stateTensor_controller: obs})
    return t, probs
  
  def step_controller(self, r, g, s):
    """Select the next task and (if r is not None) train the controller on
    the (prev_state, prev_action, r, g, s) transition plus replayed ones.

    Returns the chosen task/action index.
    """
    self._times_used += 1
    #print self._times_used
    qvs, probs = self.q_controller(s)
    q_nxtState = np.reshape(qvs, (-1,))
    self._probs_controller = probs
    next_action = self._behaviour_policy_controller(q_nxtState)
    
    if r != None:
      if self._rl_alg_controller == 'NEURALSARSA':
        # One-hot on next_action, then recover its index for the bootstrap.
        target = self._target_policy_controller(q_nxtState, next_action)
        target = np.random.choice(np.where(np.max(target) == target)[0])
        vob = q_nxtState[target]
        #print vob
        self._sess.run(self._opt_controller,{
            self.nqTensor_controller: vob,
            self.rewTensor_controller: r,
            self.disTensor_controller: g,
            self.actionTensor_controller: self._prev_action_controller,
            self.stateTensor_controller: self._s_controller})
        self._replayBuffer_controller.append([self._s_controller, self._prev_action_controller, r, g, vob])
        # Replay: uniformly sampled stored transitions (bootstrap value was
        # frozen at storage time).
        for _ in range(self._num_offline_updates_controller):
          replay = self._replayBuffer_controller[np.random.randint(len(self._replayBuffer_controller))]
          self._sess.run(self._opt_controller,{
              self.nqTensor_controller: replay[4],
              self.rewTensor_controller: replay[2],
              self.disTensor_controller: replay[3],
              self.actionTensor_controller: replay[1],
              self.stateTensor_controller: replay[0]})
      elif self._rl_alg_controller == 'DQN':
        # DQN bootstrap: max Q-value at the new state.
        vob = np.max(q_nxtState)
        self._sess.run(self._opt_controller,{
            self.nqTensor_controller: vob,
            self.rewTensor_controller: r,
            self.disTensor_controller: g,
            self.actionTensor_controller: self._prev_action_controller,
            self.stateTensor_controller: self._s_controller})
        self._replayBuffer_controller.append([self._s_controller, self._prev_action_controller, r, g, vob])
        for _ in range(self._num_offline_updates_controller):
          replay = self._replayBuffer_controller[np.random.randint(len(self._replayBuffer_controller))]
          self._sess.run(self._opt_controller,{
              self.nqTensor_controller: replay[4],
              self.rewTensor_controller: replay[2],
              self.disTensor_controller: replay[3],
              self.actionTensor_controller: replay[1],
              self.stateTensor_controller: replay[0]})

    self._s_controller = np.reshape(s, (1,-1))
    self._prev_action_controller = next_action
    
    return next_action

  def reset_controller(self):
    """Rebuild the graph and reset only the controller's learned state.

    NOTE(review): this also reinitialises the driver's weights, since
    handleTF() rebuilds both networks.
    """
    tf.reset_default_graph()
    self.handleTF()
    self.resetState_controller()
    self.resetReplayBuffer_controller()
    self._probs_controller = np.ones((1, self._num_features_controller))/(self._num_features_controller*1.)
    self._times_trained_controller = 0
    self._prev_action_controller = 0



    # resetReplayBuffer_driver
  def resetReplayBuffer(self):
    """Empty the driver's replay buffer."""
    self._replayBuffer_driver = []
    
    # resetState_driver
  def resetState(self):
    """Reset the driver's last-seen state to its initial observation."""
    self._s_driver = self._initial_state_driver 
    self._s_driver = np.reshape(self._s_driver, (1,-1))


  def _target_policy_driver(self, q, a):
    """SARSA-style target: one-hot on the taken action."""
    return np.eye(len(q))[a]
 
  def _behaviour_policy_driver(self, q, train):
    """Epsilon-greedy action selection: eps=0.2 when training, 0.05 otherwise."""
    #return epsilon_greedy(q, 0.1) if train == True else np.random.choice(np.where(np.max(q) == q)[0])
    return epsilon_greedy(q, 0.2) if train == True else epsilon_greedy(q, 0.05)

  def q_driver(self, obs):
    """Run the driver network: returns the Q-values for obs."""
    #print [n.name for n in tf.get_default_graph().as_graph_def().node]
    obs = np.reshape(obs,(1,-1))
    #print obs
    t = self._sess.run(self._q_driver, {self.stateTensor_driver: obs})
    return t
  
  # step_driver
  def step(self, r, g, s, item, train):
    """Driver step: pick an action for state s and (when train) update the
    driver network on the latest transition plus replayed ones.

    Returns (action, inventory, cost); cost is currently always None.
    """
    cost = None
    
    if item != None:
      self._inventory.add(item)
    
    # This function should return an action
    q_nxtState = np.reshape(self.q_driver(s), (-1,))
    next_action = self._behaviour_policy_driver(q_nxtState, train)
    

    if self._rl_alg_driver == 'NEURALSARSA':
      # One-hot on next_action, then recover its index for the bootstrap.
      target = self._target_policy_driver(q_nxtState, next_action)
      target = np.random.choice(np.where(np.max(target) == target)[0])
      
      # Optimiser
      vob = q_nxtState[target]
#       cost = self._sess.run(self._cost_driver,{
#           self.nqTensor_driver: vob,
#           self.rewTensor_driver: r,
#           self.disTensor_driver: g,
#           self.actionTensor_driver: self._prev_action_driver,
#           self.stateTensor_driver: self._s_driver})
      if train == True:
        self._sess.run(self._opt_driver,{
            self.nqTensor_driver: vob,
            self.rewTensor_driver: r,
            self.disTensor_driver: g,
            self.actionTensor_driver: self._prev_action_driver,
            self.stateTensor_driver: self._s_driver})
        self._replayBuffer_driver.append([self._s_driver, self._prev_action_driver, r, g, vob])
        for _ in range(self._num_offline_updates_driver):
          replay = self._replayBuffer_driver[np.random.randint(len(self._replayBuffer_driver))]
          self._sess.run(self._opt_driver,{
              self.nqTensor_driver: replay[4],
              self.rewTensor_driver: replay[2],
              self.disTensor_driver: replay[3],
              self.actionTensor_driver: replay[1],
              self.stateTensor_driver: replay[0]})
    elif self._rl_alg_driver == 'DQN':
      # DQN bootstrap: max Q-value at the new state.
      vob = np.max(q_nxtState)
#       cost = self._sess.run(self._cost_driver,{
#               self.nqTensor_driver: vob,
#               self.rewTensor_driver: r,
#               self.disTensor_driver: g,
#               self.actionTensor_driver: self._prev_action_driver,
#               self.stateTensor_driver: self._s_driver})
      if train == True:
        self._sess.run(self._opt_driver,{
            self.nqTensor_driver: vob,
            self.rewTensor_driver: r,
            self.disTensor_driver: g,
            self.actionTensor_driver: self._prev_action_driver,
            self.stateTensor_driver: self._s_driver})
        self._replayBuffer_driver.append([self._s_driver, self._prev_action_driver, r, g, vob])
        for _ in range(self._num_offline_updates_driver):
          replay = self._replayBuffer_driver[np.random.randint(len(self._replayBuffer_driver))]
          self._sess.run(self._opt_driver,{
              self.nqTensor_driver: replay[4],
              self.rewTensor_driver: replay[2],
              self.disTensor_driver: replay[3],
              self.actionTensor_driver: replay[1],
              self.stateTensor_driver: replay[0]})

    
        
    self._s_driver = np.reshape(s, (1,-1))
    self._prev_action_driver = next_action
    return next_action, self._inventory, cost

Agent 0: Random

In [26]:
class Random(object):
  """A random agent.
  
  This agent returns an action between 0 and 'number_of_arms', 
  uniformly at random. The 'previous_action' argument of 'step'
  is ignored.
  """

  def __init__(self, number_of_arms):
    self._number_of_arms = number_of_arms
    self.name = 'random'
    self.reset()

  def step(self, previous_action, reward):
    # Both arguments are ignored by design.
    return np.random.randint(self._number_of_arms)

  def getProbs(self):
    # Uniform distribution over the arms.
    arms = self._number_of_arms
    return np.ones((arms)) / arms

  def reset(self):
    # Stateless agent: nothing to reset.
    pass

Agent 1: REINFORCE

In [27]:
class REINFORCE(object):
  """REINFORCE (policy-gradient) bandit agent with an optional baseline.

  Maintains one preference per arm; actions are sampled from a softmax
  over the preferences.  After each observed reward the preferences are
  moved along the policy gradient; with `baseline=True` the running mean
  reward is subtracted to reduce variance.

  Fixes over the original:
  - the softmax is evaluated ONCE per update, before any preference is
    changed (the original recomputed it after partially updating, so
    later arms used a skewed policy in their gradient term);
  - the softmax subtracts the max preference for numerical stability.
  """

  def __init__(self, number_of_arms, step_size=0.1, baseline=False):
    self._number_of_arms = number_of_arms
    self._lr = step_size
    self.name = 'reinforce, baseline: {}'.format(baseline)
    self._baseline = baseline
    # Row 0: cumulative reward per arm; row 1: pull counts (diagnostics).
    self.action_values = np.zeros((2, self._number_of_arms))
    self.action_preferences = np.zeros((1, self._number_of_arms))
    self.total_reward = 0.
    self.number_rewards = 0.
    self.reset()

  def step(self, previous_action, reward):
    """Record the reward for the previous arm, then sample the next arm."""
    if previous_action is not None:
      self.number_rewards += 1.
      self.total_reward += reward
      self.action_values[0, previous_action] += reward
      self.action_values[1, previous_action] += 1.
      self.updatePreferences(previous_action, reward)
    return np.random.choice(np.arange(0, self._number_of_arms), p=self.softmax())

  def reset(self):
    """Zero all statistics and preferences."""
    self.action_values = np.zeros((2, self._number_of_arms))
    self.action_preferences = np.zeros((1, self._number_of_arms))
    self.number_rewards = 0.
    self.total_reward = 0.

  def updatePreferences(self, previous_action, reward):
    """One policy-gradient step on the preferences for an observed reward."""
    advantage = reward
    if self._baseline:
      # Use the running mean reward as a baseline.
      advantage = reward - self.total_reward / self.number_rewards
    # Evaluate the policy once so every arm's update uses the same probabilities.
    probs = self.softmax()
    for i in range(self._number_of_arms):
      if i == previous_action:
        self.action_preferences[0, i] += self._lr * advantage * (1. - probs[i])
      else:
        self.action_preferences[0, i] -= self._lr * advantage * probs[i]

  def softmax(self):
    """Numerically stable softmax over the preferences, flattened to 1-D."""
    shifted = self.action_preferences - np.max(self.action_preferences)
    exps = np.exp(shifted)
    return (exps / np.sum(exps)).flatten()

  def getProbs(self):
    return self.softmax()

Agent 2: EXP3

In [28]:
class EXP3(object):
  """The EXP3 adversarial bandit algorithm.

  Keeps exponentially updated weights per arm and samples from the
  weight distribution mixed with a uniform one, controlled by `gamma`.
  Arms that have never been pulled are forced first, in index order.
  """

  def __init__(self, number_of_arms, gamma):
    self._number_of_arms = number_of_arms
    self.name = 'exp3 Gamma: ' + str(gamma)
    # Row 0: current sampling probabilities; row 1: pull counts.
    self.action_values = np.zeros((2, self._number_of_arms))
    self.gamma = gamma
    self.weights = np.ones((1, self._number_of_arms))
    self.time = 0.
    self.reset()

  def step(self, previous_action, reward):
    if previous_action is not None:
      # Importance-weighted reward estimate: only the pulled arm gets a
      # non-zero entry, scaled by the inverse of its sampling probability.
      estimate = np.zeros((1, self._number_of_arms))
      estimate[0, previous_action] = reward / self.action_values[0, previous_action]
      self.weights = self.weights * np.exp(self.gamma * estimate / self._number_of_arms)
      self.action_values[1, previous_action] += 1.
    # Mix the normalized weights with the uniform distribution.
    mixed = (1 - self.gamma) * self.weights / np.sum(self.weights) + self.gamma / self._number_of_arms
    self.action_values[0, :] = mixed
    sampled = np.random.choice(self._number_of_arms, p=self.action_values[0, :])
    self.time += 1.
    # Force any never-pulled arm before trusting the sampled one.
    never_pulled = np.where(self.action_values[1, :] == 0.)[0]
    if never_pulled.size > 0:
      return never_pulled[0]
    return sampled

  def getProbs(self):
    return self.action_values[0, :]

  def reset(self):
    self.action_values = np.zeros((2, self._number_of_arms))
    self.weights = np.ones((1, self._number_of_arms))
    self.time = 0
    return

Agent 7: Task Selection via RL

In [29]:
class NS_DQN(object):
  """Neural SARSA / DQN student agent built on a TensorFlow-1 graph.

  A single fully connected network (one hidden ReLU layer) maps a
  flattened observation to one Q-value per action.  Each online update
  is followed by `num_offline_updates` replay updates sampled uniformly
  from an unbounded replay buffer.  When `teacher_student` is True the
  same class doubles as a task-selection teacher (name prefixed 'HYPER').

  NOTE(review): there is no separate target network; the comment below
  says this is deliberate (C-step of 1).
  """
  
  # Target Network is the same, as C-step is just C=1
  
  def __init__(self, teacher_student, number_of_features, number_of_hidden, number_of_actions, initial_state, rl_alg='DQN', num_offline_updates=30, step_size=0.01):
    """Build the TF graph and initialise agent state.

    Args:
      teacher_student: if True, this instance is a teacher ('HYPER' name).
      number_of_features: length of the flattened observation vector.
      number_of_hidden: hidden-layer width.
      number_of_actions: number of discrete actions (Q outputs).
      initial_state: initial observation; stored reshaped to (1, -1).
      rl_alg: 'DQN' or 'NEURALSARSA'; selects the bootstrap target in step().
      num_offline_updates: replay updates performed after each online update.
      step_size: RMSProp learning rate.
    """
    
    tf.reset_default_graph()
    self._prev_action = 0
    self._step = step_size
    self._num_features = number_of_features
    self._num_action = number_of_actions
    self._num_hidden = number_of_hidden
    self._initial_state = initial_state
    self._s = initial_state
    self._s = np.reshape(self._s, (1,-1))
    self._times_trained = 0
    self._replayBuffer = []
    self._num_offline_updates = num_offline_updates
    self._rl_alg = rl_alg
    self._teacher_student = teacher_student
    # Start from a uniform action distribution until the first forward pass.
    self._probs = np.ones((1, self._num_action))/(self._num_action*1.)
    self._inventory = set()
    
    if self._teacher_student == True:
      self.name = 'HYPER '+self._rl_alg
    else:
      self.name = self._rl_alg
    
    self.handleTF()
  
  def resetReplayBuffer(self):
    """Drop all stored transitions."""
    self._replayBuffer = []
    
  def resetState(self):
    """Reset the tracked observation to the initial state, shaped (1, -1)."""
    self._s = self._initial_state 
    self._s = np.reshape(self._s, (1,-1))
    
  def handleTF(self):
    """Create the session, placeholders, network, loss and optimizer.

    The loss is the squared TD error between the bootstrap target
    (reward + discount * next-Q, all fed via placeholders) and the
    Q-value of the action taken.
    """
    self._sess = tf.Session()
    #tf.reset_default_graph()
    self.rewTensor = tf.placeholder(tf.float64)
    self.disTensor = tf.placeholder(tf.float64)
    self.nqTensor = tf.placeholder(tf.float64)
    self.actionTensor = tf.placeholder(tf.int32)
    self.stateTensor = tf.placeholder(tf.float64, shape=(1,self._num_features))
    self._dense_1 = tf.layers.dense(self.stateTensor,
                                    self._num_hidden, activation=tf.nn.relu,
                                    kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
                                    bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
    self._dense_2 = tf.layers.dense(self._dense_1,
                                    self._num_action, activation=None,
                                    kernel_initializer=tf.random_uniform_initializer(-0.05, 0.05),
                                    bias_initializer=tf.random_uniform_initializer(-0.05, 0.05))
    self._q = tf.reshape(self._dense_2, (self._num_action,))    
    self._softmx = tf.nn.softmax(self._q)
    self._cost = tf.losses.mean_squared_error(self.rewTensor + self.disTensor*self.nqTensor, self._q[self.actionTensor])
    self._opt = tf.train.RMSPropOptimizer(self._step).minimize(self._cost) 
    # Initialise AFTER minimize() so the optimizer's slot variables exist.
    self._sess.run(tf.global_variables_initializer())

  def _target_policy(self, q, a):
    """One-hot vector selecting action `a` (greedy w.r.t. the taken action)."""
    return np.eye(len(q))[a]
 
  def _behaviour_policy(self, q, train):
    """Epsilon-greedy action selection; lower epsilon at test time for students."""
    if self._teacher_student == True:   
      return epsilon_greedy(q, 0.2)#if train == True else np.random.choice(np.where(np.max(q) == q)[0])
    else:
      return epsilon_greedy(q, 0.2) if train == True else epsilon_greedy(q, 0.05)
  
  def getProbs(self):
    # Softmax of the Q-values from the most recent forward pass.
    return self._probs
  
  def q_noProbs(self, obs):
    """Q-values for `obs` only (no softmax)."""
    obs = np.reshape(obs,(1,-1))
    t = self._sess.run(self._q, {self.stateTensor: obs})
    return t
  
  def q(self, obs):
    """Q-values and their softmax for `obs`."""
    obs = np.reshape(obs,(1,-1))
    t, probs = self._sess.run([self._q, self._softmx], {self.stateTensor: obs})
    return t, probs
  
  def step(self, r, g, s, item=None, train=True):
    """Choose the next action and (if training) update the network.

    Args:
      r: reward for the previous action, or None on the first call.
      g: discount used in the bootstrap target.
      s: next observation.
      item: optional item picked up; added to the agent's inventory.
      train: when True, run one online update plus replay updates.

    Returns:
      (next_action, inventory, cost).  `cost` is never computed here and
      is always None (the loss evaluation is commented out).
    """
    cost = None

    if item != None:
      self._inventory.add(item)

    qvs, probs = self.q(s)
    q_nxtState = np.reshape(qvs, (-1,))
    self._probs = probs
    next_action = self._behaviour_policy(q_nxtState, train)
    
    if r != None:
      if self._rl_alg == 'NEURALSARSA':
        # SARSA target: bootstrap on the Q-value of the action actually chosen.
        target = self._target_policy(q_nxtState, next_action)
        target = np.random.choice(np.where(np.max(target) == target)[0])
        vob = q_nxtState[target]

        if train == True:
          self._sess.run(self._opt,{
              self.nqTensor: vob,
              self.rewTensor: r,
              self.disTensor: g,
              self.actionTensor: self._prev_action,
              self.stateTensor: self._s})
          # Store the transition, then replay uniformly sampled ones.
          self._replayBuffer.append([self._s, self._prev_action, r, g, vob])
          for _ in range(self._num_offline_updates):
            replay = self._replayBuffer[np.random.randint(len(self._replayBuffer))]
            self._sess.run(self._opt,{
                self.nqTensor: replay[4],
                self.rewTensor: replay[2],
                self.disTensor: replay[3],
                self.actionTensor: replay[1],
                self.stateTensor: replay[0]})
      elif self._rl_alg == 'DQN':
        # Q-learning target: bootstrap on the maximum next-state Q-value.
        vob = np.max(q_nxtState)

        if train == True:
          self._sess.run(self._opt,{
              self.nqTensor: vob,
              self.rewTensor: r,
              self.disTensor: g,
              self.actionTensor: self._prev_action,
              self.stateTensor: self._s})
          self._replayBuffer.append([self._s, self._prev_action, r, g, vob])
          for _ in range(self._num_offline_updates):
            replay = self._replayBuffer[np.random.randint(len(self._replayBuffer))]
            self._sess.run(self._opt,{
                self.nqTensor: replay[4],
                self.rewTensor: replay[2],
                self.disTensor: replay[3],
                self.actionTensor: replay[1],
                self.stateTensor: replay[0]})

    self._s = np.reshape(s, (1,-1))
    self._prev_action = next_action
    return next_action, self._inventory, cost

  def reset(self):
    """Rebuild the graph from scratch and clear all agent state."""
    tf.reset_default_graph()
    self.handleTF()
    self.resetState()
    self.resetReplayBuffer()
    self._probs = np.ones((1, self._num_action))/(self._num_action*1.)
    self._times_trained = 0
    self._prev_action = 0
    self._inventory = set()

Task Selector

In [30]:
class TaskSelector(object):
  """An adversarial multi-armed task bandit.

  Wraps a student RL agent: each `step` trains the student on the
  selected task, evaluates it (which task is evaluated depends on
  `reward_signal`), and converts the slope of the last few evaluation
  rewards into a teacher reward scaled to [-1, 1] via the 20th/80th
  percentiles of the reward history.
  """

  def __init__(self, rl_agent, tasks, reward_signal, number_of_tasks_selection_steps):
    self._unscaled_reward_history = []
    self._rl_agent = rl_agent
    self._tasks = tasks
    self._reward_signal = reward_signal
    # Ring buffers (most recent entry first) of the last 5 evaluation rewards per task.
    self._tasks_buffer = np.zeros((5, len(tasks)))
    self._tasks_buffer_scaled = np.zeros((5, len(tasks)))
    # Cumulative completed-episode counts, one column per selection step.
    self._tasks_episodes_completed_train = np.zeros((len(tasks), number_of_tasks_selection_steps))
    self._tasks_episodes_completed_test = np.zeros((len(tasks), number_of_tasks_selection_steps))
    self._time = 0
    self._number_of_tasks_selection_steps = number_of_tasks_selection_steps

  def resetReplayBuffer(self):
    self._rl_agent.resetReplayBuffer()

  def _record_completion(self, matrix, task_id, completed):
    """Carry all cumulative counts forward one column, incrementing `task_id`'s row if `completed`."""
    previous = self._time - 1  # column -1 is all zeros at t=0, the intended base case
    matrix[task_id, self._time] = matrix[task_id, previous] + (1 if completed else 0)
    for row in range(matrix.shape[0]):
      if row != task_id:
        matrix[row, self._time] = matrix[row, previous]

  def step(self, action_task_id):
    """Train on task `action_task_id`, evaluate, and return the teacher reward.

    Returns:
      (scaled_reward, raw_evaluation_reward, flattened_scaled_buffer).

    Raises:
      ValueError: for an out-of-range task id or an unimplemented
        reward signal (the original crashed with a NameError on the
        latter because `reward_after` was never assigned).
    """
    # Validate before doing any work (the original checked only after training).
    if action_task_id < 0 or action_task_id >= len(self._tasks):
      raise ValueError('Action {} is out of bounds for a '
                       '{}-armed bandit'.format(action_task_id, len(self._tasks)))

    if self._reward_signal in ('PG', 'SPG'):
      # Train and evaluate on the selected task.
      _, _, ep_comp_train = run_step(self._tasks[action_task_id], self._rl_agent, True)
      reward_after, reward_steps_after, ep_comp_test = run_step(self._tasks[action_task_id], self._rl_agent, False)
      self._record_completion(self._tasks_episodes_completed_train, action_task_id, ep_comp_train)
      self._record_completion(self._tasks_episodes_completed_test, action_task_id, ep_comp_test)
    elif self._reward_signal == 'TPG':
      # Train on the selected task but always evaluate on the final (target) task.
      _, _, ep_comp_train = run_step(self._tasks[action_task_id], self._rl_agent, True)
      reward_after, reward_steps_after, ep_comp_test = run_step(self._tasks[-1], self._rl_agent, False)
      self._record_completion(self._tasks_episodes_completed_train, action_task_id, ep_comp_train)
      self._record_completion(self._tasks_episodes_completed_test, len(self._tasks) - 1, ep_comp_test)
    elif self._reward_signal == 'MPG':
      # NOTE(review): the evaluation runs on the *selected* task but is
      # recorded under a uniformly sampled task id -- confirm this is intended.
      uniform_sampled_task_id = np.random.choice(len(self._tasks))
      _, _, ep_comp_train = run_step(self._tasks[action_task_id], self._rl_agent, True)
      reward_after, reward_steps_after, ep_comp_test = run_step(self._tasks[action_task_id], self._rl_agent, False)
      self._record_completion(self._tasks_episodes_completed_train, action_task_id, ep_comp_train)
      self._record_completion(self._tasks_episodes_completed_test, uniform_sampled_task_id, ep_comp_test)
    else:
      # 'GPG', 'VCG', 'GVCG', 'L2G' were never implemented.
      raise ValueError('Reward signal {!r} is not implemented'.format(self._reward_signal))

    self._time += 1
    # Shift the task's reward ring buffer and store the newest evaluation reward.
    self._tasks_buffer[:, action_task_id] = np.roll(self._tasks_buffer[:, action_task_id], 1)
    self._tasks_buffer[0, action_task_id] = reward_after
    # The raw teacher reward is the slope of the recent rewards (learning progress).
    X = np.arange(self._tasks_buffer.shape[0])
    slope, _, _, _, _ = stats.linregress(X, self._tasks_buffer[:, action_task_id])
    rhat = slope

    # Rescale into [-1, 1] using the 20th/80th percentiles of the history.
    self._unscaled_reward_history.append(rhat)
    temp_history = np.array(sorted(self._unscaled_reward_history))
    p_20 = np.percentile(temp_history, 20)
    p_80 = np.percentile(temp_history, 80)

    if rhat <= p_20:
      r = -1.
    elif rhat > p_80:
      r = 1.
    else:
      r = 2.0 * (rhat - p_20) / (p_80 - p_20) - 1.

    self._tasks_buffer_scaled[:, action_task_id] = np.roll(self._tasks_buffer_scaled[:, action_task_id], 1)
    self._tasks_buffer_scaled[0, action_task_id] = r

    return r, reward_after, np.reshape(self._tasks_buffer_scaled.T, (-1,))
In [31]:
def plot_values(values, colormap='pink', vmin=None, vmax=None):
  """Render a 2-D array as a heat map with no axis ticks and a colorbar."""
  plt.imshow(values, interpolation="nearest", cmap=colormap, vmin=vmin, vmax=vmax)
  plt.xticks([])
  plt.yticks([])
  plt.colorbar(ticks=[vmin, vmax])

def plot_action_values(action_values, title, vmin=None, vmax=None):
  """Plot one value map per action plus a soft state-value map.

  Args:
    action_values: array of shape (H, W, 4) with per-action values.
    title: label embedded in the state-value subplot title.
    vmin, vmax: color-scale bounds; default to the data range.  (The
      original unconditionally overwrote the caller's values, leaving
      these parameters dead.)
  """
  q = action_values
  fig = plt.figure(figsize=(10, 10))
  fig.subplots_adjust(wspace=0.3, hspace=0.3)
  # Respect caller-supplied bounds; fall back to the data range.
  if vmin is None:
    vmin = np.min(action_values)
  if vmax is None:
    vmax = np.max(action_values)
  dif = vmax - vmin
  for a in [0, 1, 2, 3]:
    plt.subplot(3, 3, map_from_action_to_subplot(a))
    plot_values(q[..., a], vmin=vmin - 0.05*dif, vmax=vmax + 0.05*dif)
    action_name = map_from_action_to_name(a)
    plt.title(r"$q(s, \mathrm{" + action_name + r"})$")

  plt.subplot(3, 3, 5)
  # Soft value estimate: mostly the greedy value, slightly smoothed by the mean.
  v = 0.9 * np.max(q, axis=-1) + 0.1 * np.mean(q, axis=-1)
  plot_values(v, colormap='summer', vmin=vmin, vmax=vmax)
  plt.title(r"$v(s), \mathrm{" + title + r"}$")

def plot_greedy_policy(grid, title, q):
  """Overlay the greedy action (as an arrow glyph) on every cell of the grid.

  Args:
    grid: environment exposing `plot_grid(title)` and a `_layout` array.
    title: title passed through to `grid.plot_grid`.
    q: action values of shape (H, W, 4); the argmax over the last axis
      selects the arrow drawn in each cell.
  """
  action_names = [r"$\uparrow$", r"$\rightarrow$", r"$\downarrow$", r"$\leftarrow$"]
  greedy_actions = np.argmax(q, axis=2)
  grid.plot_grid(title)
  # plt.hold('on') was removed in matplotlib >= 3.0 (and was a default
  # no-op in 1.x); axes retain previously drawn artists by default.
  for i in range(grid._layout.shape[0]):
    for j in range(grid._layout.shape[1]):
      action_name = action_names[greedy_actions[i, j]]
      plt.text(j, i, action_name, ha='center', va='center')
In [32]:
def _plot_episode_curves(completed, split, student, teacher, reward_signal):
  """One figure of cumulative completed-episode curves, one line per task."""
  plt.figure(figsize=(12, 12))
  plt.title('Cumulative Average Number of {} Episodes Completed; Student: {}, Teacher: {}, Reward Signal: {}'.format(split, student, teacher, reward_signal))
  plt.ylabel('Episodes Completed')
  plt.xlabel('Time')
  for x in range(completed.shape[0]):
    plt.plot(completed[x, :], label='Task ' + str(x))
  plt.legend(loc='upper right')

def plot_completed_episodes(completed_train, completed_test, student, teacher, reward_signal):
  """Plot cumulative completed-episode curves for the train and test splits.

  Args:
    completed_train, completed_test: arrays of shape (num_tasks, num_steps).
    student, teacher, reward_signal: labels embedded in the figure titles.
  """
  # The original duplicated the whole figure block for each split.
  _plot_episode_curves(completed_train, 'Train', student, teacher, reward_signal)
  _plot_episode_curves(completed_test, 'Test', student, teacher, reward_signal)
In [33]:
def run_experiment(bandit, algs, tasks, number_of_steps_of_selecting_tasks, repetitions, vision_size, tabular, agent_type_driver, hidden_units, step_size, reward_signal):
  """Run multiple repetitions of a task-selection (teacher/student) experiment.

  For every teacher in `algs`, runs `repetitions` trials of
  `number_of_steps_of_selecting_tasks` task selections, training a student
  agent on the selected tasks via a TaskSelector bandit.  Per-step rewards,
  task choices, selection probabilities and policy entropies are recorded
  per repetition, and average completed-episode curves plus final
  action-value maps are plotted per teacher.

  Returns:
    (reward_dict, reward_delta_dict, action_dict, prob_dict, entropy_dict),
    each mapping a teacher's name to one list per repetition.
  """
  reward_dict = {}
  reward_delta_dict = {}
  action_dict = {}
  prob_dict = {}
  entropy_dict = {}
  
  for alg in algs:
    print('Running:', alg.name)
    reward_dict[alg.name] = []
    reward_delta_dict[alg.name] = []
    action_dict[alg.name] = []
    prob_dict[alg.name] = []
    entropy_dict[alg.name] = []
    
    rl_agent = None
    qs = None
    completed_train_episodes = np.zeros((len(tasks),number_of_steps_of_selecting_tasks))
    completed_test_episodes = np.zeros((len(tasks),number_of_steps_of_selecting_tasks))
    
    for qq in range(repetitions):
      print('Rep:', qq)
      
      if isinstance(alg, NEURAL_CONTROLLER_DRIVER):
        # The teacher is itself a neural controller that also drives the student.
        alg.reset()
        bandit = TaskSelector(alg, tasks, reward_signal, number_of_steps_of_selecting_tasks)

        reward_dict[alg.name].append([0.])
        reward_delta_dict[alg.name].append([])
        action_dict[alg.name].append([])
        prob_dict[alg.name].append([])
        entropy_dict[alg.name].append([])
        action = None
        reward = None
        prob = None
        entropy = None
        capability = alg._initial_state_controller
        
        for i in range(number_of_steps_of_selecting_tasks):
          try:
            action = alg.step_controller(reward, 0.98, capability)
            prob = alg.getProbs()
            entropy = -1.0 * np.sum(prob * np.log(prob))
          # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
          except Exception:
            raise ValueError(
                "The step function of algorithm `{}` failed.\
                Perhaps you have a bug, such as a typo.\
                Or, perhaps your value estimates or policy has diverged.\
                (E.g., internal quantities may have become NaNs.)\
                Try adding print statements to see if you can find a bug.".format(alg.name))
          reward, reward_from_environment, capability = bandit.step(action)
          bandit.resetReplayBuffer()
          # Cumulative environment reward, plus the per-step teacher signal.
          reward_dict[alg.name][-1].append(reward_from_environment+reward_dict[alg.name][-1][-1])
          reward_delta_dict[alg.name][-1].append(reward)
          action_dict[alg.name][-1].append(action)
          prob_dict[alg.name][-1].append(prob.copy())
          entropy_dict[alg.name][-1].append(entropy)
        
        # Accumulate the final-task action values (walls masked to zero).
        h, w = tasks[-1]._layout.shape
        obs = np.array([[tasks[-1].get_obs_at(x, y) for x in range(2, w-2)] for y in range(2, h-2)])
        if qs is not None:
          qs += np.array([[[alg.q_driver(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
        else:
          qs = np.array([[[alg.q_driver(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
        
      else:
        # The teacher is a bandit; construct a fresh student per repetition.
        alg.reset()
        rl_agent = None
        
        if agent_type_driver == 'NEURALSARSA' or agent_type_driver == 'DQN':
          rl_agent = NS_DQN(teacher_student=False,
                      number_of_features=(2*vision_size + 1)**2,
                      number_of_hidden=hidden_units,
                      number_of_actions=4,
                      initial_state=tasks[0].get_obs(),
                      rl_alg=agent_type_driver,
                      step_size=step_size)
        elif agent_type_driver == 'Q':
          rl_agent = GeneralQ(number_of_states=tasks[0]._layout.size,
                  number_of_actions=4,
                  initial_state=tasks[0].get_obs(),
                  target_policy=Q_target_policy,
                  behaviour_policy=gen_behaviour_policy,
                  double=True)
        elif agent_type_driver == 'SARSA':
          rl_agent = GeneralQ(number_of_states=tasks[0]._layout.size,
                  number_of_actions=4,
                  initial_state=tasks[0].get_obs(),
                  target_policy=SARSA_target_policy,
                  behaviour_policy=gen_behaviour_policy,
                  double=True)
        
        bandit = TaskSelector(rl_agent, tasks, reward_signal, number_of_steps_of_selecting_tasks)
        
        reward_dict[alg.name].append([0.])
        reward_delta_dict[alg.name].append([])
        action_dict[alg.name].append([])
        prob_dict[alg.name].append([])
        entropy_dict[alg.name].append([])
        action = None
        reward = None
        prob = None
        entropy = None
        capability = None
        if 'HYPER' in alg.name: 
          capability = alg._initial_state
        
        for i in range(number_of_steps_of_selecting_tasks):
          try:
            # This is for when the teacher is neural and student not
            if 'HYPER' in alg.name:
              action, _, _ = alg.step(reward, 0.98, capability)
            else:
              action = alg.step(action, reward)
            prob = alg.getProbs()
            entropy = -1.*np.sum(prob*np.log(prob))
          # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
          except Exception:
            raise ValueError(
                "The step function of algorithm `{}` failed.\
                Perhaps you have a bug, such as a typo.\
                Or, perhaps your value estimates or policy has diverged.\
                (E.g., internal quantities may have become NaNs.)\
                Try adding print statements to see if you can find a bug.".format(alg.name))
          reward, reward_from_environment, capability = bandit.step(action)
          bandit.resetReplayBuffer()
          
          reward_dict[alg.name][-1].append(reward_from_environment+reward_dict[alg.name][-1][-1])
          reward_delta_dict[alg.name][-1].append(reward)
          action_dict[alg.name][-1].append(action)
          prob_dict[alg.name][-1].append(prob.copy())
          entropy_dict[alg.name][-1].append(entropy)
          
        if agent_type_driver == 'NEURALSARSA' or agent_type_driver == 'DQN':
          h, w = tasks[-1]._layout.shape
          obs = np.array([[tasks[-1].get_obs_at(x, y) for x in range(2, w-2)] for y in range(2, h-2)])
          if qs is not None:
            qs += np.array([[[rl_agent.q_noProbs(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
          else:
            qs = np.array([[[rl_agent.q_noProbs(o)[a] for a in range(4)] if o[vision_size,vision_size] == 0 else np.zeros((4,)) for o in ob] for ob in obs])
        elif agent_type_driver == 'Q' or agent_type_driver == 'SARSA':
          if qs is not None:
            qs += rl_agent.q_values.reshape(tasks[-1]._layout.shape + (4,))
          else:
            qs = rl_agent.q_values.reshape(tasks[-1]._layout.shape + (4,))
      
      completed_train_episodes += bandit._tasks_episodes_completed_train
      completed_test_episodes += bandit._tasks_episodes_completed_test
    
    # Average accumulated statistics over repetitions before plotting.
    completed_train_episodes /= repetitions
    completed_test_episodes /= repetitions
    qs /= repetitions
    
    if isinstance(alg, NEURAL_CONTROLLER_DRIVER):
      plot_completed_episodes(completed_train_episodes, completed_test_episodes, alg._rl_alg_driver, alg.name, reward_signal)
    else:
      plot_completed_episodes(completed_train_episodes, completed_test_episodes, agent_type_driver, alg.name, reward_signal)
    plot_action_values(qs, alg.name)
        
  return reward_dict, reward_delta_dict, action_dict, prob_dict, entropy_dict

def train_task_agents(agents, number_of_arms, number_of_steps_of_selecting_tasks, tasks, reward_signal, repetitions=1, vision_size=1, tabular=False, agent_type_driver='norm', hidden_units=100, step_size=0.01):
  """Run the bandit experiment for every teacher and plot summary figures.

  Aggregates per-repetition results from `run_experiment` into
  repetition-averaged curves, then plots, per teacher: the selection
  probabilities over time, cumulative reward, reward deltas, and policy
  entropy.  (The dead `smoothed_actions` and `smoothed_reward_deltas_stds`
  accumulators from the original have been removed -- they were computed
  but never used.)
  """
  bandit = None
  reward_dict, reward_delta_dict, action_dict, prob_dict, entropy_dict = run_experiment(bandit, agents, tasks, number_of_steps_of_selecting_tasks, repetitions, vision_size, tabular, agent_type_driver, hidden_units, step_size, reward_signal)
  
  smoothed_rewards = {}
  smoothed_rewards_stds = {}
  
  smoothed_reward_deltas = {}
  
  smoothed_probs = {}
  
  smoothed_entropies = {}
  smoothed_entropies_stds = {}
  
  agent_set = set()
  
  # Sum per-repetition series (divided by `repetitions` just before plotting).
  for agent, rewards in reward_dict.items():
    agent_set.add(agent)
    smoothed_rewards[agent] = (np.sum(np.array([np.array(x) for x in rewards]), axis=0)).T
    smoothed_rewards_stds[agent] = (np.std(np.array([np.array(x) for x in rewards]), axis=0)).T
  
  for agent, rewards in reward_delta_dict.items():
    agent_set.add(agent)
    smoothed_reward_deltas[agent] = (np.sum(np.array([np.array(x) for x in rewards]), axis=0)).T
  
  for agent, probs in prob_dict.items():
    smoothed_probs[agent] = (np.sum(np.array([np.array(x) for x in probs]), axis=0)).T

  for agent, entropies in entropy_dict.items():
    smoothed_entropies[agent] = (np.sum(np.array([np.array(x) for x in entropies]), axis=0)).T
    smoothed_entropies_stds[agent] = (np.std(np.array([np.array(x) for x in entropies]), axis=0)).T
    
  # One heat map of task-selection probabilities per teacher.
  for agent in agent_set:
    smoothed_probs[agent] /= repetitions
    
    plt.figure(figsize=(44,40))
    plt.imshow(smoothed_probs[agent], interpolation=None)
    plt.title('Teacher: {}, Student: {}, Reward Signal: {}'.format(agent, agent_type_driver, reward_signal))
    plt.xlabel('Time')
    plt.ylabel('Task')
  
  plt.figure(figsize=(12,12))
  plt.title('Cumulative Average Reward, Student: {}, Reward Signal: {}'.format(agent_type_driver, reward_signal))
  plt.ylabel('Reward')
  plt.xlabel('Time')
  for agent in agent_set:
    smoothed_rewards[agent] /= repetitions    
    plot = plt.plot(smoothed_rewards[agent], label=agent)
    plt.fill_between(np.arange(smoothed_rewards[agent].shape[0]), smoothed_rewards[agent]-smoothed_rewards_stds[agent], smoothed_rewards[agent]+smoothed_rewards_stds[agent], alpha=0.15, facecolor=plot[-1].get_color())
  plt.legend(loc='upper right')
  
  plt.figure(figsize=(12,12))
  plt.title('Delta Average Reward, Student: {}, Reward Signal: {}'.format(agent_type_driver, reward_signal))
  plt.ylabel('Delta')
  plt.xlabel('Time')
  for agent in agent_set:
    smoothed_reward_deltas[agent] /= repetitions    
    plt.plot(smoothed_reward_deltas[agent], label=agent)
  plt.legend(loc='upper right')
  
  plt.figure(figsize=(12,12))
  plt.title('Entropy, Student: {}, Reward Signal: {}'.format(agent_type_driver, reward_signal))
  plt.ylabel('Policy Entropy')
  plt.xlabel('Time')
  for agent in agent_set:
    smoothed_entropies[agent] /= repetitions  
    plot = plt.plot(smoothed_entropies[agent], label=agent)
    plt.fill_between(np.arange(smoothed_entropies[agent].shape[0]), smoothed_entropies[agent]-smoothed_entropies_stds[agent], smoothed_entropies[agent]+smoothed_entropies_stds[agent], alpha=0.15, facecolor=plot[-1].get_color())
  plt.legend(loc='upper right')
In [34]:
def run_step(env, agent, train):
    """Run `agent` in `env` for at most env.distanceToGoal() steps.

    Args:
      env: environment exposing resetState(), distanceToGoal() and
        step(action, inventory) -> (reward, discount, next_state, item).
      agent: agent exposing resetState(), _inventory and
        step(reward, discount, state, item, train).
      train: forwarded to the agent; also controls key removal (below).

    Returns:
      (total_reward, mean_reward_per_step, episode_completed).
    """
    env.resetState()
    agent.resetState()
    number_of_steps = env.distanceToGoal()
    # Agents without an explicit initial_action start with action 0.
    try:
      action = agent.initial_action()
    except AttributeError:
      action = 0
    agent_inventory = agent._inventory
    steps_completed = 0
    total_reward = 0.
    while steps_completed != number_of_steps:
      reward, discount, next_state, item = env.step(action, agent_inventory)

      if item is not None:
        agent._inventory.add(item)

      total_reward += reward

      # Keep the key during training so the goal reward stays reachable on
      # the evaluation (train == False) run, where we record the reward.
      # discard() is safe even if the key was never picked up (remove()
      # raised KeyError in that case).
      if reward == 100 and train == False:
        agent._inventory.discard('KEY')

      if discount == 0:
        # Episode finished; guard against dividing by zero when the very
        # first environment step terminates the episode.
        return total_reward, total_reward / max(steps_completed, 1), True
      action, agent_inventory, _ = agent.step(reward, discount, next_state, item, train)
      steps_completed += 1

    mean_reward = total_reward / number_of_steps if number_of_steps else 0.
    return total_reward, mean_reward, False
  
def run_episode(env, agent, number_of_episodes, train):
    """Run `number_of_episodes` episodes of `agent` in `env`.

    Episode boundaries are signalled by the environment returning
    discount == 0; per-episode reward and wall-clock duration are recorded
    at that point.

    Args:
      env: environment with step(action) -> (reward, discount, next_state).
      agent: learner with step(reward, discount, next_state, train) -> action.
        May optionally provide initial_action().
      number_of_episodes: how many episodes to run.
      train: True during training, False during evaluation.

    Returns:
      (total_reward, total_duration) summed over all episodes.
    """
    # Per-episode records; only their sums are returned.
    reward_per_episode = np.zeros((1, number_of_episodes))
    duration_per_episode = np.zeros((1, number_of_episodes))

    # Some agents expose an explicit initial action; default to action 0.
    try:
      action = agent.initial_action()
    except AttributeError:
      action = 0

    episodes_completed = 0
    total_reward_per_episode = 0.

    start_time = time.time()

    while episodes_completed != number_of_episodes:
      reward, discount, next_state = env.step(action)
      total_reward_per_episode += reward

      if discount == 0:
        # End of an episode: record its reward and wall-clock duration.
        duration = time.time() - start_time
        reward_per_episode[0, episodes_completed] = total_reward_per_episode
        duration_per_episode[0, episodes_completed] = duration

        episodes_completed += 1
        start_time = time.time()
        total_reward_per_episode = 0.

      action = agent.step(reward, discount, next_state, train)

    total_reward = np.sum(reward_per_episode)
    total_duration = np.sum(duration_per_episode)

    return total_reward, total_duration

def map_from_action_to_subplot(a):
  """Return the matplotlib subplot position for discrete action `a`."""
  return (2, 6, 8, 4)[a]

def map_from_action_to_name(a):
  """Return the human-readable direction name for discrete action `a`."""
  return ("up", "right", "down", "left")[a]
  
def epsilon_greedy(q_values, epsilon):
  """With probability ~epsilon pick a uniform random action, else the greedy one."""
  explore = np.random.random() >= epsilon
  if not explore:
    # Uniform over the last axis of q_values.
    num_actions = np.array(q_values).shape[-1]
    return np.random.randint(num_actions)
  return np.argmax(q_values)

Reward as Reward Signal for Bandit(s)

NeuralRL/Bandit Controllers with Neural RL Agents

The NeuralRL controller's state input is the reward buffered across all tasks over the last 5 timesteps

The NeuralRL controller does not receive the agent's inventory as part of its input

In [35]:
# Hyper-parameters for the bandit/NeuralRL controller experiments below.
number_of_steps_of_selecting_tasks = 250  # task-selection decisions per run
reps = 5  # independent repetitions per agent
# Reward-signal identifiers passed to train_task_agents (presumably
# mean/sum prediction gain — TODO confirm against its definition).
reward_signals=['MPG','SPG']
# Student ("driver") learning algorithms the controller schedules tasks for.
drivers = ['SARSA','Q']
hidden_units_controller_net = 100  # hidden-layer width of the controller net
hidden_units_driver_net = 100  # hidden-layer width of the driver net
In [36]:
vision_size = 1  # agent observes a (2*vision_size+1)^2 window
tabular_grid = False  # use observation windows rather than tabular state
step_size = 0.01

# goal_loc entries look like (row, col, reward) triples — note the 3-tuples
# below, despite the older "(row, col)" note. TODO confirm against Hallway.
tasks = []
#tasks.append(Hallway(goal_loc = [(8,2,-3), (2,2,5), (2,13,5), (8,13,100)], tabular=tabular_grid, vision_size=vision_size, discount=0.98))

# Single task containing all four goals at once (baseline: no curriculum split).
tasks.append(Hallway(goal_loc = [(8,2,10),(2,2,5),(2,14,5),(8,14,100)], discount=0.98))

for task in tasks:
  task.plot_grid()
  
# Intrinsically Motivated Curriculum Learning
number_of_arms_tasks = len(tasks)

# Only the random task-selector is run here, once per driver algorithm.
agents = [
      Random(number_of_arms_tasks),
]

for driver in drivers:
  train_task_agents(agents,
                    number_of_arms_tasks,
                    number_of_steps_of_selecting_tasks, 
                    tasks,
                    'MPG',
                    reps,
                    vision_size,
                    tabular_grid,
                    driver,
                    hidden_units_driver_net)
('Running:', 'random')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 2)
PICKED UP KEY
('Rep:', 3)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 4)
('Running:', 'random')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
('Rep:', 2)
PICKED UP KEY
('Rep:', 3)
PICKED UP KEY
('Rep:', 4)
PICKED UP KEY
In [37]:
vision_size = 1  # agent observes a (2*vision_size+1)^2 window
tabular_grid = False
step_size = 0.01

# goal_loc entries look like (row, col, reward) triples — TODO confirm
# against the Hallway constructor.
tasks = []
# One task per goal, so the controller can sequence them as a curriculum.
tasks.append(Hallway(goal_loc = [(8, 2, 10)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 2, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 14, 5)], discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 14, 100)], discount=0.98))

for task in tasks:
  task.plot_grid()
  
# Intrinsically Motivated Curriculum Learning
number_of_arms_tasks = len(tasks)

# Full roster of task-selection controllers to compare: bandits (EXP3,
# REINFORCE +/- baseline), neural RL controllers (DQN / neural SARSA over a
# buffered-reward state of 5 timesteps per task), and a random baseline.
agents = [
    EXP3(number_of_arms_tasks, 0.2),
    REINFORCE(number_of_arms_tasks, baseline=False),
    REINFORCE(number_of_arms_tasks, baseline=True),
    NS_DQN(teacher_student=True,
           number_of_features=number_of_arms_tasks*5,
                      number_of_hidden=hidden_units_controller_net,
                      number_of_actions=number_of_arms_tasks,
                      initial_state=np.zeros((1,number_of_arms_tasks*5)),
                      rl_alg='DQN',
                      step_size=step_size),
    NS_DQN(teacher_student=True,
           number_of_features=number_of_arms_tasks*5,
                      number_of_hidden=hidden_units_controller_net,
                      number_of_actions=number_of_arms_tasks,
                      initial_state=np.zeros((1,number_of_arms_tasks*5)),
                      rl_alg='NEURALSARSA',
                      step_size=step_size),
    Random(number_of_arms_tasks),
]


# Every (reward signal, driver algorithm) combination is trained separately.
for reward_signal in reward_signals:
  for driver in drivers:
    train_task_agents(agents,
                      number_of_arms_tasks,
                      number_of_steps_of_selecting_tasks, 
                      tasks,
                      reward_signal,
                      reps,
                      vision_size,
                      tabular_grid,
                      driver,
                      hidden_units_driver_net)
('Running:', 'exp3 Gamma: 0.2')
('Rep:', 0)
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: False')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: True')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER DQN')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER NEURALSARSA')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'random')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
/usr/lib/python2.7/dist-packages/matplotlib/pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
('Running:', 'exp3 Gamma: 0.2')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: False')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: True')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER DQN')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER NEURALSARSA')
('Rep:', 0)
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
('Rep:', 2)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 3)
('Rep:', 4)
PICKED UP KEY
('Running:', 'random')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
PICKED UP KEY
('Rep:', 4)
('Running:', 'exp3 Gamma: 0.2')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 4)
('Running:', 'reinforce, baseline: False')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: True')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER DQN')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER NEURALSARSA')
('Rep:', 0)
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 2)
PICKED UP KEY
('Rep:', 3)
('Rep:', 4)
('Running:', 'random')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'exp3 Gamma: 0.2')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: False')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'reinforce, baseline: True')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER DQN')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'HYPER NEURALSARSA')
('Rep:', 0)
('Rep:', 1)
('Rep:', 2)
('Rep:', 3)
('Rep:', 4)
('Running:', 'random')
('Rep:', 0)
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 2)
('Rep:', 3)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 4)
In [41]:
# Hyper-parameters for the joint neural controller/driver experiments.
number_of_steps_of_selecting_tasks = 250  # task-selection decisions per run
reps = 2  # fewer repetitions — these runs are slower than the bandit ones

# reward_signals=['MPG','SPG']
# Neural driver algorithms used by the combined controller/driver agents.
drivers = ['NEURALSARSA','DQN']

# Only the SPG reward signal is evaluated in this section.
reward_signals=['SPG']


hidden_units_controller_net = 100  # hidden-layer width of the controller net
hidden_units_driver_net = 100  # hidden-layer width of the driver net
In [39]:
vision_size = 1  # agent observes a (2*vision_size+1)^2 window
tabular_grid = False
step_size_controller = 0.01
step_size_driver = 0.01

# goal_loc entries look like (row, col, reward) triples — TODO confirm
# against the Hallway constructor.
tasks = []
# One task per goal so the controller can sequence them as a curriculum.
tasks.append(Hallway(goal_loc = [(8, 2, 10)], tabular=tabular_grid, vision_size=vision_size,  discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 2, 5)], tabular=tabular_grid, vision_size=vision_size,  discount=0.98))
tasks.append(Hallway(goal_loc = [(2, 14, 5)], tabular=tabular_grid, vision_size=vision_size,  discount=0.98))
tasks.append(Hallway(goal_loc = [(8, 14, 100)], tabular=tabular_grid, vision_size=vision_size,  discount=0.98))


for task in tasks:
  task.plot_grid()
  
# Intrinsically Motivated Curriculum Learning
number_of_arms_tasks = len(tasks)

# All four (controller_alg, driver_alg) pairings of NEURALSARSA x DQN.
# Positional arguments appear to be: controller feature size (5 buffered
# rewards per task), driver observation size, controller/driver hidden
# widths, number of tasks, number of grid actions (4), initial controller
# state, initial driver observation, controller algorithm, driver
# algorithm — TODO confirm against NEURAL_CONTROLLER_DRIVER's signature.
agents = [
  NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
                              (2*vision_size + 1)**2,
                              hidden_units_controller_net,
                              hidden_units_driver_net,
                              number_of_arms_tasks,
                              4,
                              np.zeros((1,number_of_arms_tasks*5)),
                              tasks[0].get_obs(),
                              'NEURALSARSA',
                              'DQN',
                              num_offline_updates_controller=25, 
                              num_offline_updates_driver=25,
                              step_size_controller=step_size_controller,
                              step_size_driver=step_size_driver),
  NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
                              (2*vision_size + 1)**2,
                              hidden_units_controller_net,
                              hidden_units_driver_net,
                              number_of_arms_tasks,
                              4,
                              np.zeros((1,number_of_arms_tasks*5)),
                              tasks[0].get_obs(),
                              'DQN',
                              'DQN',
                              num_offline_updates_controller=25, 
                              num_offline_updates_driver=25,
                              step_size_controller=step_size_controller,
                              step_size_driver=step_size_driver),
  NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
                              (2*vision_size + 1)**2,
                              hidden_units_controller_net,
                              hidden_units_driver_net,
                              number_of_arms_tasks,
                              4,
                              np.zeros((1,number_of_arms_tasks*5)),
                              tasks[0].get_obs(),
                              'DQN',
                              'NEURALSARSA',
                              num_offline_updates_controller=25, 
                              num_offline_updates_driver=25,
                              step_size_controller=step_size_controller,
                              step_size_driver=step_size_driver),
    NEURAL_CONTROLLER_DRIVER(number_of_arms_tasks*5,
                              (2*vision_size + 1)**2,
                              hidden_units_controller_net,
                              hidden_units_driver_net,
                              number_of_arms_tasks,
                              4,
                              np.zeros((1,number_of_arms_tasks*5)),
                              tasks[0].get_obs(),
                              'NEURALSARSA',
                              'NEURALSARSA',
                              num_offline_updates_controller=25, 
                              num_offline_updates_driver=25,
                              step_size_controller=step_size_controller,
                              step_size_driver=step_size_driver),
#     Random(number_of_arms_tasks),
]


# Note: unlike the earlier cells, no driver argument is passed here — the
# driver is embedded in each NEURAL_CONTROLLER_DRIVER agent.
for reward_signal in reward_signals:
  train_task_agents(agents,
                    number_of_arms_tasks,
                    number_of_steps_of_selecting_tasks, 
                    tasks,
                    reward_signal,
                    reps,
                    vision_size,
                    tabular_grid)
        
    
    
    
    
('Running:', 'HYPER NEURALSARSA')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Running:', 'HYPER DQN')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Running:', 'HYPER DQN')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Running:', 'HYPER NEURALSARSA')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
In [42]:
# Random task-selection baseline, re-run with the neural-driver settings
# (reward_signals/drivers/reps as set in the preceding cell) for comparison.
agents = [
    Random(number_of_arms_tasks),
]


for reward_signal in reward_signals:
  for driver in drivers:
    train_task_agents(agents,
                      number_of_arms_tasks,
                      number_of_steps_of_selecting_tasks, 
                      tasks,
                      reward_signal,
                      reps,
                      vision_size,
                      tabular_grid,
                      driver,
                      hidden_units_driver_net)
('Running:', 'random')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Running:', 'random')
('Rep:', 0)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
('Rep:', 1)
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY
PICKED UP KEY